; Size-optimized DOS .COM program (NASM syntax, 16-bit real mode).
; Per the comments in this file it casts one ray per pixel against a plane
; (normal PN, ray origin RO) and draws a checkerboard in VGA mode 13h.
; Register values listed on the ORG line are ASSUMED start-up values; the code
; never initializes them itself.
org 100h        ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h; last 16 bytes of PSP = 0
   ; dw -16     ; F0FF40C2: lock inc word [bx+si-0x3e]   (disabled alternative constant)
   db 0x84,0xC2 ;=test dl,al
                ; These two bytes are executed as an instruction AND read as data:
                ; they are the top half of the dword at RO+8 (address 0FEh). With the
                ; assumed zero bytes at 0FEh..0FFh that dword is 0xC2840000 = -66.0f.
; After the DB above, $ = 102h, so both vectors live in the last bytes of the PSP
; and overlap each other (PN.z is the same dword as RO.x):
PN equ $-4*5    ; = 0EEh: dd 0.0, -1.0, /*0.0*/  ; fallthrough (3rd component is RO.x)
RO equ $-4*3    ; = 0F6h: dd 0.0, 0.0, -66.1     (z comes from the DB bytes above)
; Store 0xBF80 into the upper word of PN.y; with the lower word assumed zero the
; dword becomes 0xBF800000 = -1.0f. Addressing is SI-relative (SI assumed 100h)
; so that the displacement fits in a signed byte.
I:mov word[byte PN+4+2 + si-100h],0xBF80

;Video mode + palette: 4 bits orange * 4 bits blue. Uses default index 0 (black).
; The same INT 10h instruction is used for two things:
;   1st pass : AL=13h (AH presumably 0 at start-up - TODO confirm) -> set mode 13h
;   later    : AX=1010h -> set one DAC palette entry, BX=index DH=R CH=G CL=B
; Loop runs for BX = 1..0FFh; palette entry 0 is never written.
  pusha        ; save all registers; restored by POPA below
  mov al,13h
P:int 10h      ; set video mode | set palette index: bx=i dh=R ch=G cl=B
  inc bx       ; next palette index
  mov al,bl
  aam 16       ; ax = ....rrrr....bbbb   (AH = index / 16, AL = index % 16)
  imul dx,ax,4 ; scale both nibbles by 4: DH = R (0..60), DL = B (0..60)
  mov ax,1010h ; function code for the next INT 10h
  mov cx,dx    ; CL = B, CH = R (CH is turned into G below)
  add ch,cl
BIG equ $+1    ; address of the 2nd byte of the SHR below; not referenced in the visible source
  shr ch,1     ; ch=G = (R+B)/2
  jnz P        ; dx=cx=0 bx=100h ax=1010h   (exit when G == 0, i.e. both nibbles are 0)
  popa         ;cf=0   (POPA leaves flags untouched; CF comes from the final SHR)

; Main loop. M = once per frame, X = once per pixel (65536 iterations per frame,
; terminated when DI wraps to 0). The loop keeps its state inside the stack frames
; created by PUSHA and addresses them as [bx-NN] with BX = 0, which relies on the
; assumed SP = -2 at start-up. Frame counter T lives in CX across frames.
M:          ;OUTER REGS: CX=T DI=segment=YX AX=0 BX=0
  mov di,0xA000-10-20-20-20-6  ; visible pixels are A0000..AF9FF
  pusha         ; YX:XX must be neighbors after 2*PUSHA (DI:ax are OK; add ax,NN is smaller)
                ; outer frame: the saved DI at [-18] doubles as the high word (Y) of the
                ; 32-bit screen-position accumulator
  mov es,di     ; want X=0 Y=0 in the center of the screen

  xor di,di     ;pixel address (can be any addressing register)
X:          ;inner regs: ax=XX di=adr_pixel
        
  fninit        ; adr:     -34 -32 -30 -28 -26 -24 -22 -20 -18 -16 -14 -12 -10  -8  -6  -4  -2
  pusha         ; stack:    di  si  bp  sp  bx  dx  cx  ax  DI  SI  BP  SP  BX  DX  CX  AX ret
                ; s16:  pixadr 100 9?? -18   0 (result) ..X..Y 100 9??  -2   0  cs   T   0   0
                ; (lower-case = inner frame pushed here, upper-case = outer frame from M)
  ; Build the unnormalized ray direction on the FPU stack from the saved registers.
  fild dword[bx-12] ; Z=65534   (outer saved SP = 0FFFEh as low word, saved BX = 0 as high word)
  fild word[bx-18]  ; Y         (outer saved DI)
  fild word[bx-19]  ; X   |rD.x rD.y rD.z   (unaligned word: high byte of inner AX + low byte of Y)

  mov di,104h       ; DI = SI+4 so [bp+di]/[bx+di] address the middle (y) float of a vector
  pusha             ; third frame: protects BX/BP/DI while they are used as vector pointers
  mov bx,bp         ; both operands = scratch vector V at BP+SI -> dot(V,V)
  call STORE_DOT    ;V=rD (unnormalized);  |rD*rD
  fsqrt
  fld1
  fdivrp st1        ;|rsqrt(rD*rD)
  call LOAD_SCALE   ;|rd.x rd.y rd.z

  mov bx,PN-0x100   ; BX+SI -> PN (plane normal)
  call STORE_DOT    ;V=rd   |D=pn*rd
  mov bp,RO-0x100   ; BP+SI -> RO (ray origin); BX still points at PN
  call DOT          ;|pn*ro D
  popa              ; restore BX=0, BP -> V, DI = pixel address

  fisubr word[bx-28];PD   ;|N=pd-pn*ro D   (PD = inner saved SP, -18 per the table above)
  fdivrp            ;|t
  ftst              ;set carry flag if t<0
  fnstsw ax         ; AX is clobbered here but restored by the POPA below
  sahf              ; FPU C0 -> CF; x87 instructions, CALL/RET and POPA leave CF intact until JB
  call LOAD_SCALE   ;|t*rd.x t*rd.y t*rd.z
  
                    ; h = {ro + rd*t}; we need only x and z
  ; Results are written into the inner PUSHA frame so that POPA loads them into DX/CX.
  fistp word[bx-24] ; pushed_dx = s16(h.x)
  fstp st0          ; discard the y component
  ;fadd dword[byte RO+8 + si-100h]
  fiadd word[bx-6];T     (outer saved CX = frame counter)
  fistp word[bx-22] ; pushed_cx = s16(h.z + T)

  popa              ; DX = h.x, CX = h.z + T, AX/DI/BX = inner-loop state
  jb  S             ; t < 0: no plane hit in front of the ray -> color 0F0h
  xor cx,dx
  and cl,8          ; checkerboard: CL = 0 or 8 depending on bit 3 of (x xor z)
  db 0xA9 ; skip 2 bytes (test ax,NN)   -- opcode swallows the following MOV as its immediate
S:mov cl,0xF0
                ;DI:ax = adr += 0000CCCD
  add ax,0xCCCD ;                 xxxx
  adc [bx-18],bx;               yyyy     (BX = 0: only the carry is added to the Y word)

  inc di        ; sets ZF when DI wraps to 0; MOV below does not change flags
  mov [es:di],cl
  jnz X         ; the test can also be about AX==0
  popa          ; drop the outer frame
  inc cx  ; T++
  in al,60h     ; read keyboard controller data port
  cmp al,1      ; scancode 1 = ESC
  jnz M
 ;ret     ; fallthrough   (execution continues into LOAD_SCALE and exits via its RET)

; LOAD_SCALE - push a 3-float vector multiplied by a scalar onto the FPU stack.
;   In : st0 = k; vector of three consecutive dwords at BP+SI
;        (callers set DI = SI+4, so [bp+di] is the y component)
;   Out: st0 = k*x, st1 = k*y, st2 = k*z  (k itself is consumed)
;   Uses only x87 load/multiply/exchange, so CPU flags are not modified.
; The main loop also falls through into this routine after the exit test; the
; RET here then ends the program.
;LOAD: fld1 | call      (to load a vector unscaled, push 1.0 first)
LOAD_SCALE: ; v3* bp+si, |k --> k*x k*y k*z
  fld dword[bp+di]
  fmul st1           ;|ky k
  fld dword[bp+si+8]
  fmul st2           ;|kz ky k
  fxch st2           ;|k ky kz
  fmul dword[bp+si]  ;|kx ky kz
  ret

; STORE_DOT - pop a vector from the FPU stack into memory, then fall through
; into DOT.
;   In : st0 = x, st1 = y, st2 = z; destination = three dwords at BP+SI
;        (callers set DI = SI+4, so [bp+di] is the y component)
;   Out: the three values are popped and stored; continues into DOT below.
;STORE: call | fstp st0   (to store only, discard the dot product afterwards)
STORE_DOT: ; v3* bp+si, |x y z -->
  fstp dword[bp+si]
  fstp dword[bp+di]
  fstp dword[bp+si+8]

; DOT - push the dot product of the vectors at BP+SI and BX+SI.
;   In : BP+SI -> vector a, BX+SI -> vector b (DI = SI+4 as above)
;   Out: st0 = a.x*b.x + a.y*b.y + a.z*b.z; previous stack contents move down by one
;   No general-purpose register is written.
DOT:  ; v3* bp+si, v3* bx+si, | --> ax*bx+ay*by+az*bz
  fld dword[bp+si]
  fmul dword[bx+si]
  fld dword[bp+di]
  fmul dword[bx+di]
  faddp
  fld dword[bp+si+8]
  fmul dword[bx+si+8]
  faddp
  ret
